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1. Identify the computation bulk in Devito kernels 

2. Translate it into an OPS kernel 


11 



step 1 - Identify the computation bulk in Devito kernels 


exemple.py 


from devito import Eq, Grid, TimeFunction, Operator 
grid_2d = Grid(shape=(4, 4)) 

v = TimeFunction(name= 1 v' , grid=grid_2d, time_order=2, save=10) 
equation = Eq(v.forward, v+1) 
operator = Operator(equation) 


print (operator) 



#define _POSIX_C_SOURCE 200809L 
#include "stdlib.h" 

#include "math.h" 

#include "sys/time.h" 

#include "xmmintrin.h" 

#include "pmmintrin.h" 

struct dataobj { 

void *restrict data; 

int * size; 

int * npsize; 

int * dsize; 

int * hsize; 

int * hofs; 

int * oofs; 

}; 

struct profiler { 
double section0; 

}; 


int Kernel(struct dataobj *restrict v_vec, const int time_M, const int time_m, struct profiler * timers, const int x_M, const int x_m, const int 
y_M, const int y_m) { 

float (*restrict v) [v_vec->size[1] ] [v_vec->size[2] ]_attribute_((aligned (64))) = (float (*) [v_vec->size[1] ][v_vec->size[2] ]) v_vec->data; 

/* Flush denormal numbers to zero in hardware */ 

_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON) ; 

_MM_SET_FLUSH_ZERO_MODE( _MM_FLUSH_ZERO_ON) ; 
for (int time = time_m; time <= time_M; time += 1){ 
struct timeval start_section0, end_section@; 
gettimeofday(&start_section@, NULL) ; 

/* Begin section© */ 

for (int x = x_m; x <= x_M; x += 1) { 

#pragma omp simd aligned(v : 32 ) 

for (int y = y_m; y <= y_M; y += 1) { 

v[time + 1][x + 1][y + 1j = v[time][x + 1][y + 1] +1; 

} 

} 

/* End section0 */ 

gettimeofday(&end_section@, NULL) ; 

timers->section0 += (double)(end_section@ . tv_sec-start_section@.tv_sec) + (double)(end_section0.tv_usec-start_section0 . tv_usec) /1000000; 

} 

return 0; 

} 




#define _POSIX_C_SOURCE 200809L 
#include "stdlib.h" 

#include "math.h" 

#include "sys/time.h" 


struct dataobj { 

void *restrict data; 

int * size; 

int * npsize; 

int * dsize; 

int * hsize; 

int * hofs; 

int * oofs; 

}; 

struct profiler { 
double section0; 

}; 


int Kernel(struct dataobj *restrict v_vec, const int time_M, const int time_m, const int x_M, const int x_m, const int y_M, const int y_m) { 
float (*restrict v)[v_vec->size[1]] [v_vec->size[2] ] _attribute_ ((aligned (64))) = (float (*) [v_vec->size[1] ][v_vec->size[2]]) v_vec->data; 


for (int time = time_m; time <= time_M; time += 1){ 
struct timeval start_section0, end_section@; 
gettimeofday(&start_section0, NULL) ; 

for (int x = x_m; x <= x_M; x += 1) ( 

for (int y = y_m; y <= y_M; y += 1) { 

v[time + 1][x + 1][y + 1] = v[time][x + 1][y + 1] + 1; 

} 

} 

gettimeofday(&end_section@, NULL) ; 

timers->section@ += (double)(end_section@ . tv_sec-start_section@.tv_sec) + (double)(end_section@ . tv_usec-start_section0 . tv_usec) /1000000 ; 

} 

return 0; 


} 



#define _POSIX_C_SOURCE 200809L 
#include "stdlib.h" 

#include "math.h" 


struct dataobj { 

void *restrict data; 

int * size; 

int * npsize; 

int * dsize; 

int * hsize; 

int * hofs; 

int * oofs; 

}; 


int Kernel(struct dataobj *restrict v_vec, const int time_M, const int time_m, const int x_M, const int x_m, const int y_M, const int y_m) { 
float (*restrict v)[v_vec->size[1]] [v_vec->size[2] ] _attribute_ ((aligned (64))) = (float (*) [v_vec->size[1] ][v_vec->size[2]]) v_vec->data; 


for (int time = time_m; time <= time_M; time += 1){ 


for (int x = x_m; x <= x_M; x += 1 ) { 

for (int y = y_m; y <= y_M; y += 1) { 

v[time + 1][x + 1][y + 1] = v[time][x + 1][y + 1] + 1; 

} 

} 


} 

return 0; 

} 



#define _POSIX_C_SOURCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, const int x_m, const int y_M, const int y_m) 

{ 


for (int time = time_m; time <= time_M; 


time += 1){ 


for (int x = x_m; x <= x_M; x 

for (int y = y_m; y <= y_M; 
v[time + 1][x + 1][y + 1] 

} 


+= 1) < 
y += 1) ( 

= v[time][x + 1][y + 1] 


1 ; 


} 

return 0; 



} 



#define _POSIX_C_SOURCE 200809L 
#include "stdlib.h" 

#include "math.h" 

int Kernel(float * v, const int const int time_m, const int x_M, const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{ 

for (int x = x_m; x <= x_M; x += 1) 

{ 

for (int y = y_m; y <= y_M; y += 1) 

{ 

v[time + 1 ][x + 1][y + 1] = v[time][x + 1][y + 1] +1; 

} 

} 

} 

return 0; 


} 



step 1 - Identify the computation bulk in Devito kernels 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{ 

for (int x = x_m; x <= x_M; x += 1) 

{ 

for (int y = y_m; y <= y_M; y += 1) 

{ 

v[time + 1][x + 1][y + 1] = v[time][x + 1][y + 1] + 1; 

} 

} 

} 

return 0; 


18 




step 1 - Identify the computation bulk in Devito kernels 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{ 

for (int x = x_m; x <= x_M; x += 1) 

{ Eq(v.forward, v+1) 

for (int y = y_m; y <= y_M; y += 1) 

{ 

v[time + 1][x + 1][y + 1] = v[time][x + 1][y + 1] + 1; 

} 

} 

} 

return 0; 
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step 1 - Identify the computation bulk in Devito kernels 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 


for (int time = time_m; time <= time_M; time += 1) 






1 . 
2 . 


Identify the computation bulk in Devito kernels 
Translate it into an OPS kernel 
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step 2 - Translation to OPS kernels 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int const int time_m, const int x_M, 

const int x_m, const int y_M, const int y_m) 

{ 


for (int time = time_m; time <= time_M; time += 1) 






step 2 - Translation to OPS kernels 


I 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c| 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 

int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 



return 0; 


} 
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1. Identify the computation bulk in 

2. Translate it into an OPS kernel 

i. Preparation 

ii. Declarations 

iii. Invocation 


Devito kernels 
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step 2 - Translation to OPS kernels 
iii. Invocation 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 

int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{. - 

for (int x = x_m; x <= x_M; x += 1) 

{ 

for (int y = y_m; y <= y_M; y += 1) 

{ 

v[time + 1][x + 1][y + 1] = v[time] 

} 

> } 

return 0; 



} 
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step 2 - Translation to OPS kernels 
iii. Invocation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int const int time_m, const int x_M, 

const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 


1 


ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[t1], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 


} 

return 0; 





step 2 - Translation to OPS kernels 
iii. Invocation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int const int time_m, const int x_M, 

const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 


1 


ops_par_loop(OPS_Kernel_0 "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_aat^v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[t1], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 


} 

return 0; 





step 2 - Translation to OPS kernels 
iii. Invocation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int const int time_m, const int x_M, 

const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 


1 


ops_par_loop(OPS_Kernel_P " nDC _ u ' o, -nel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_aat v_dat[t0] 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg.dat v_dat[t1 ] 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 


} 

return 0; 





step 2 - Translation to OPS kernels 
iii. Invocation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int const int time_m, const int x_M, 

const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 


1 


ops_par_loop(OPS_Kernel_P " ODC _ l/Q, 'nel.. c >" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_aat v_dat[t0] 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat v_dat[t1 ] 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 


} 

return 0; 





step 2 - Translation to OPS kernels 
iii. Invocation 


exemple.py 


from devito import Eq, Grid, TimeFunction, Operator 
grid_2d = Grid(shape=(4, 4)) 

v = TimeFunction(name= 'v' , grid=grid_2d, time_order=2, save=10) 
equation = Eq(v.forward, v+1 ) 
operator = Operator(equation) 


print (operator) 




just a representation! 
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step 2 - Translation to OPS kernels 
iii. Invocation 


exemple.py 


from devito import Eq, Grid, TimeFunction, Operator 
grid_2d = Grid(shape=(4, 4)) 

v = TimeFunction(name= 1 v' , grid=grid_2d, time_order=2, save=10) 
equation = Eq(v.forward, v+1) 
operator = Operator(equation) 

print (operator.view) 
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<Callable Kernel> 

<List (0, 2, 0)> 

<ArrayCast> 

<List (3, 1, 0)> 

<C.Comment /* Flush denormal numbers to zero in hardware */> 

<C.Statement _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);> 

<C.Statement _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);> 

<List (0, 1, 0)> 

<[affine, sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 
<TimedList (2, 1, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1) : : (0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1): : (0, 0)> 
<ExpressionBundle (1)> 

<Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/1000000> 


<Callable Kernel> 

<List (0, 2, 0)> 

<ArrayCast> 

<List (3, 1, 0)> 

<C.Comment /* Flush denormal numbers to zero in hardware */> 

<C.Statement _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON) ;> 

<C.Statement _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);> 

<List (0, 1, 0)> 

<[affine,sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 
<TimedList (2, 1, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1)::(0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1)::(0, 0)> 
<ExpressionBundle (1)> 

<Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/I000000> 



<Callable Kernel> 

<List (0, 2, 0)> 

<ArrayCast> 

<List (3, 1, 0)> 

<C.Comment /* Flush denormal numbers to zero in hardware */> 

<C.Statement _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON) ;> 

<C.Statement _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);> 

<List (0, 1, 0)> 

<[affine,sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 
<TimedList (2, 1, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1)::(0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1) : : (0, 0)> 
<ExpressionBundle (1)> 

<Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/I000000> 





<Callable Kernel> 

<List (0, 2, 0)> 

< ArrayCast> _ 

<List (3, 1, 0)> 

<C.Comment /* Flush denormal numbers to zero in hardware */> 

<C.Statement _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON) ;> 

<C.Statement _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);> 

<List (0, 1, 0)> 

<[affine,sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 
<TimedList (2, 1, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1)::(0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1)::(0, 0)> 
<ExpressionBundle (1)> 

Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/I000000> 










<Callable Kernel> 

<List (0, 2, 0)> 

<A rrayCast> _ 

<List (3, 1, 0)> 

<C.Comment 7 * Flush denormal numbers to zero in hardware */> 

<C : Statement MM .SET DENORMALS ZERO MODEf MM DENORKALS ZERO ON/) :> 

pList (0, 1, 0)> 

<[affine,sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 
<TimedList (2, 1, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1) : : (0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1)::(0, 0)> 
<ExpressionBundle (1)> 

<Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/I000000> 











<Callable Kernel> 

<List (0, 2, 0)> 

<A rrayCast> _ 

<List (3, 1, 0)> 

<C.Comment 7 * Flush denormal numbers to zero in hardware */> 

<C : Statement MM .SET DENORMALS ZERO MODE ( MM DENORKALS ZERO ON/) :> 

pList (0, 1, 0)> 

<[affine,sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 

< I imedList (2, T, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1) : : (0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1): : (0, 0)> 
<ExpressionBundle (1)> 

Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/1000000; 















<Callable Kernel> 
pList (0, 2, 0)> 

<A rrayCast> _ 

<List (3, 1, 0)> 

<C.Comment 7 * Flush denormal numbers to zero in hardware */> 

<C : Statement MM .SEJ DENORMALS ZERO MODE ( MM DENORKALS ZERO ON/) :> 

pList (0, 1, 0)> 

<[affine,sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 

< I imedList (2, T, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1) : : (0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1): : (0, 0)> 
<ExpressionBundle (1)> 

Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/1000000; 

















<Callable Kernel> 



<Callable Kernel> 


ii ii ii 

A callable function. 
Parameters 


name : str 

The name of the callable, 
body : Node or list of Node 
The Callable body, 
retval : str 

The return type of Callable, 
parameters : list of Basic, optional 

The objects in input to the Callable, 
prefix : list of str, optional 

Qualifiers to prepend to the Callable signature. Defaults to ''('static', 
'inline')''. 

ii ii ii 


devito/devito/ir/iet/nodes.py 




<Callable Kernel> 


A callable function. 
Parameters 


name : str 'Kernel' 

The name of the callable. 
body : Node or list of Node 
The Callable body, 
retval : str 

The return type of Callable, 
parameters : list of Basic, optional 

The objects in input to the Callable, 
prefix : list of str, optional 

Qualifiers to prepend to the Callable signature. Defaults to 
1 inline 1 )''. 


('static', 


devito/devito/ir/iet/nodes.py 




<Callable Kernel> 


A callable function. 
Parameters 


<List (0,2,0)> 


name : str * Kernel 

The name of the callable, 
body : Node or list of Node 
The Callable body, 
retval : str 

The return type of Callable, 
parameters : list of Basic, optional 

The objects in input to the Callable, 
prefix : list of str, optional 

Qualifiers to prepend to the Callable signature. Defaults to 
'inline')''. 


('static' , 


devito/devito/ir/iet/nodes.py 





<Callable Kernel> 


A callable function. 
Parameters 


name : str 

The name of the callable, 
body : Node or list of Node 
The Callable body, 
retval : str 

The return type of Callable, 
parameters : list of Basic, optional 

The objects in input to the Callable, 
prefix : list of str, optional 

Qualifiers to prepend to the Callable 
'inline')''. 

ii ii ii 


' Kernel ' 


<List (0,2,0)> 


'int ' 


signature. Defaults to 


('static' , 


devito/devito/ir/iet/nodes.py 





<Callable Kernel> 


A callable function. 
Parameters 


name : str 

The name of the callable, 
body : Node or list of Node 
The Callable body, 
retval : str 

The return type of Callable, 
parameters : list of Basic, optional 

The objects in input to the Callable, 
prefix : list of str, optional 

Qualifiers to prepend to the Callable 
'inline')''. 

ii ii ii 


' Kernel ' 


<List (0,2,0)> 


'int ' 


e 

i 

X 

X 

i 

E 

1 

X 

y_M 

time_m 

time_M 


signature. Defaults to ''('static', 


devito/devito/ir/iet/nodes.py 










step 2 - Translation to OPS kernels 
iii. Invocation 
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step 2 - Translation to OPS kernels 
iii. Invocation 
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step 2 - Translation to OPS kernels 
iii. Invocation 


ops_par_loop(OPS_Kernel_0, "OPS_Kernel. 

_0", block, 2, 

{x_m, x. 

_M, y_m, y_M}, 

ops_arg_dat (v_dat [t0], 1, 

S2D_VT0_1PT, 

"float", 

OPS.READ), 

ops_arg_dat(v_dat[tl], 1, 

S2D_VT1_1PT, 

"float", 

0PS_WRITE)); 


i 
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step 2 - Translation to OPS kernels 
iii. Invocation 


<Call> 

"""A function call. 


name : str 

The name of the callable, 
parameters : list of Basic, optional 

The objects in input to the Callable. 
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step 2 - Translation to OPS kernels 
iii. Invocation 



<Call> 

"""A function call. 


name : str ' ops_par_loop ' 

The name of the callable. 
parameters : list of Basic, optional 

The objects in input to the Callable. 
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step 2 - Translation to OPS kernels 
iii. Invocation 


<Call> 

"""A function call. 


name : str ' ops_par_loop ' 

The name of the callable. 
parameters : list of Basic, optional 

The objects in input to the Callable. 
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step 2 - Translation to OPS kernels 
iii. Invocation 


fincLaffine_trees(iet) 


52 



step 2 - Translation to OPS kernels 
iii. Invocation 


find_affine_trees(iet).items() 
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step 2 - Translation to OPS kernels 
iii. Invocation 


list(find_affine_trees(iet).items())[0] 
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step 2 - Translation to OPS kernels 
iii. Invocation 


section, trees = list(find_affine_trees(iet).items())[0] 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
mapper[trees[0].root] 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
mapper[trees[0].root] = Call( 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
mapper[trees[0].root] = Call(' ops_par_loop ', 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
mapper[trees[0].root] = Call(' ops_par_loop ', f ) 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
mapper[trees[0].root] = Call(' ops_par_loop ', j ) 

T ransformer 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
mapper[trees[0].root] = Call(' ops_par_loop ', j ) 

T ransformer(mapper) 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
mapper[trees[0].root] = Call(' ops_par_loop ', j ) 

Transformer(mapper).visit(iet) 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items())[0] 
mapper[trees[0].root] = Call(' ops_par_loop ', j ) 

iet = Transformer(mapper).visit(iet) 
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step 2 - Translation to OPS kernels 
iii. Invocation 


mapper = {} 

section, trees = list(find_affine_trees(iet).items 
mapper[trees[0].root] = Call(' ops_par_loop ',£ 
iet = Transformer(mapper).visit(iet) 


)[ 0 ] 


iEW 

1 S 

41 W 

. f 
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step 2 - Translation to OPS kernels 
iii. Invocation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 


for (int time = time_m; time <= time_M; time += 1) 

{ 

for (int x = x_m; x <= x_M; x += 1 ) 

{ 

for (int y = y_m; y <= y_M; y += 1 ) 

{ 

v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] + 1 

} 

} 


return 0; 
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step 2 - Translation to OPS kernels 
iii. Invocation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

<« - 


ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[t1 ], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 




return 0; 
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Identify the computation bulk in Devito kernels V 
Translate it into an OPS kernel 

i. Preparation 

ii. Declarations 
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Identify the computation bulk in Devito kernels Y 
Translate it into an OPS kernel 

i. Preparation 

ii. Declarations 
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1. Identify the computation bulk in Devito kernels 

2. Translate it into an OPS kernel 


i. Preparation 


ii. 

iii. 


Declarations 


Invocation 



IET 


Devito 

Expression 




















step 2 - Translation to OPS kernels 


1. Preparation 

^mp/dev^^^^cach^Ti^^999^^^^e9389a^783ec78b6^^^^^^c^c544^^q 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int const int time_m, const int x_M, 

const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl ], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

return 0; 


} 
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step 2 - Translation to OPS kernels 
i. Preparation 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl ], 1, S2D_VT1_1PT, "float", 0PS_WRITE)) 

} 

return 0; 

} 





step 2 - Translation to OPS kernels 
i. Preparation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

return 0; 


} 
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step 2 - Translation to OPS kernels 
i. Preparation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 

#define 0PS_2D 
#include <ops_seq.h> 

#include "my_ops_kernels.h" 


int Kernel(float * v, const int const int time_m, const int x_M, 

const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

return 0; 

} 











step 2 - Translation to OPS kernels 


i. Preparation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.c 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 

#define 0PS_2D 
#include <ops_seq.h> 

#include "my_ops_kernels.h" 


int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 


ops_init(0,0,1); 


diagnosis level 


for (int time = time_m; time <= time_M; time += 1) 


{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", 0PS_READ), 


ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 
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step 2 - Translation to OPS kernels 
i. Preparation 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.cpp| 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 

#define 0PS_2D 
#include <ops_seq.h> 

#include "my_ops_kernels.h" 


I int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

ops_init(0,0,1); 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", OPS.WRITE)); 

} 

ops_exit(); 
return 0; 

} 
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Identify the computation 
Translate it into an OPS 
i. Preparation V 
ii. Declarations 
iii. Invocation V 


bulk in Devito kernels 
kernel 




















step 2 - Translation to OPS kernels 
ii. Declarations 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 

int Kernel(float * v, const int time_M, const int time_m, const int x_M, 
const int x_m, const int y_M, const int y_m) 

{ 

for (int time = time_m; time <= time_M; time += 1) 

{ 

for (int x = x_m; x <= x_M; x += 1) 

{ 

for (int y = y_m; y <= y_M; y += 1) 

{ 

v[time + 1][x + 1][y + 1] = v[time][x + 1][y + 1] +1; 

} 

} 

} 

return 0; 


} 
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<Callable Kernel> 

<List (0, 2, 0)> 

<ArrayCast> 

<List (3, 1, 0)> 

<C.Comment /* Flush denormal numbers to zero in hardware */> 

<C.Statement _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);> 

<C.Statement _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);> 

<List (0, 1, 0)> 

<[affine, sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 
<TimedList (2, 1, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1) : : (0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1): : (0, 0)> 
<ExpressionBundle (1)> 

<Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/1000000> 


<Callable Kernel> 

<List (0, 2, 0)> 

<A rrayCast> _ 

<List (3, 1, 0)> 

<C.Comment 7 * Flush denormal numbers to zero in hardware */> 

<C : Statement MM .SET DENORMALS ZERO MODE ( MM DENORKALS ZERO ON/) :> 

pList (0, 1, 0)> 

<[affine,sequential] Iteration time::time::(time_m, time_M, 1)::(0, 0)> 

< I imedList (2, T, 2)> 

<C.Statement struct timeval start_section0, end_section0;> 

<C.Statement gettimeofday(&start_section0, NULL);> 

<Section (1)> 

<[affine,parallel] Iteration x::x::(x_m, x_M, 1) : : (0, 0)> 

<[affine,parallel,vector-dim] Iteration y::y::(y_m, y_M, 1): : (0, 0)> 
<ExpressionBundle (1)> 

Expression v[time +1, x+1, y+1] = v[time, x+1, y+1] +1> 


<C.Statement gettimeofday(&end_section0, NULL);> 

<C.Statement timers->section0 += (end_section0.tv_sec-start_section0.tv_sec)/1000000; 


























expr = Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1> 


DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] + 1; 



expr 


Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1> 


DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] + 1; 

DEVITO_BACKEND=ops: 

" vtl(0, 0) = vt0(0, 0) + 1 ; " 



expr 


= Expression 


v[time +1, x + 1, y + 1] = v[time, x+1, y+1] +14 


expr.lhs 


expr.rhs 


DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] + 1; 

DEVITO_BACKEND=ops: 

" I vtl(0, 0) = vt0(0, 0) + 1 ; I" 




expr = Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1 

indexed expr.rhs 

DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] +1; " 

DEVITO_BACKEND=ops: 

" I vt1(0, 0)1=|vt0(0, 0) + 1; I " 




expr = Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] J + Li 

indexed expr.rhs.lhs expr.rhs.rhs 

DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1 ] = v[time][x + 1 ][y + 1] +1; " 

DEVITO_BACKEND=ops: 

" I vtl(0, 0)1 = I vt0(0, 0) I +11; I " 




expr = Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1]l+jl> 

indexed indexed number 

DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] +1; " 

DEVITO_BACKEND=ops: 

" Ivt1(0, 0)1 = I vt0(0, 0) I +11; I " 




expr = Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1 ] 1+ 1> 

indexed indexed 

DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] +1; " 

DEVITO_BACKEND=ops: 

" I vt1(0, 0)1 = |vt0(0, 0)1 + 11 ; " 




expr = Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1> 

name indices name indices 

DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] +1; " 

DEVITO_BACKEND=ops: 

" I vt1(0, 0)1 = |vt0(0, 0)1 + 11 ; " 




expr 


= Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1> 

time space time space 

DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1 ] = v[time][x + 1 ][y + 1] +1; " 

DEVITO_BACKEND=ops: 

"|vt1(0, 0) _ vt0(0, 0) T'l; " 




expr = Expression v[time +1, x+1, y+1] = v[time, 

time space time 

DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1 
DEVITO_BACKEND=ops: 

"|vt1(0, 0) _ vt0(0, 0) T'l; " 


space 


' split_affine_ 

'shift '1311351 



expr 


Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1> 


' split_affine_ 
'shift 


DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] + 1; 


DEVITO_BACKEND=ops: 



expr 


Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1> 


DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] + 1; 

DEVITO_BACKEND=ops: 

"Ivtl(0, 0) = vt0(0, 0) + iTI" 




expr 


Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1> 


DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] + 1; 

DEVITO_BACKEND=ops: 

"Ivtl(0, 0) = vt0(0, 0) + iTI" 

























expr 


Expression v[time + 1, x + 1, y + 1] = v[time, x + 1, y + 1] +1> 


DEVITO_BACKEND=core: 

" v[time + 1 ][x + 1 ][y + 1] = v[time][x + 1 ][y + 1] + 1; 

DEVITO_BACKEND=ops: 

"Ivtl(0, 0) = vt0(0, 0) + iTI" 

























vt1(0, 0) = vt0(0, 0) + 1; 


























void OPS_Kernel_0(const ACC<float> & vt0, ACC<float> & vtl) 

{. _ 

vt1(0, 0) = vt0(0, 0) + 1; 

} 























void OPS_Kernel_0(const ACC<float> & vt0, ACC<float> & vtl) 

{. _ 

vt1(0, 0) = vt0(0, 0) + 1; 

} 
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step 2 - Translation to OPS kernels 
ii. Declarations 









step 


2 - Translation to OPS kernels 
ii. Declarations 








step 2 - Translation to OPS kernels 
ii. Declarations 


/tmp/devito-jitcache-uidl000/15e19e9389a27783ec78b6714131 clcc4fc5449f.cpp| 


#define _P0SIX_C_S0URCE 200809L 
#include "stdlib.h" 

#include "math.h" 

#define 0PS.2D 
#include <ops_seq.h> 

#include "my_ops_kernels.h” 

int Kernel(float * v, const int const int time_m, const int x_M, 

const int x_m, const int y_M, const int y_m) 

{ 

ops_init(0,0,1); 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", 0PS_READ), 
ops_arg_dat(v_dat[t1], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


} 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 


for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


} 
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step 2 - Translation to OPS kernels 
ii. Declarations 


ops.init (0,0,1); n0 - of dimmensions 

ops_block block = ops_decl_block(2, "block"); 


for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


} 


110 



step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 
ops_stencil S2D_VT0_1PT 
ops_stencil S2D_VT1_1PT 


for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


} 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops.stencil S2D_VT0_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT0_1PT") ; 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT1_1PT") ; 

no. dimensions, 
no. of points, 
points, 
name 


for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", 0PS_READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


} 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 
ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1 


for (int time = time_m; time <= time_M; time += 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , bl 
ops_arg_dat(v_dat [t0], 1, S2D_VT 
ops_arg_dat (v_dat [t1], 1, S2D_VT 

} 

ops_exit(); 
return 0; 

} 























step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 
ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1 


for (int time = time_m; time <= time_M; time += 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , bl 
ops_arg_dat (v_dat [t0], 1, S2D_VT 
ops_arg_dat(v_dat [tl], 1, S2D_VT 

} 

ops_exit(); 
return 0; 


} 























step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 
ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1 


for (int time = time_m; time <= time_M; time += 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , bl 
ops_arg_dat (v_dat [t0], 1, S2D_VT 
ops_arg_dat(v_dat [tl], 1, S2D_VT 

} 

ops_exit(); 
return 0; 


} 

























step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops.stencil S2D_VT0_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT0_1PT") ; 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT1_1PT") ; 


). 

)}; 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", 0PS_READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


ops_dat v_dat[] = 
{ops_decl_dat( 
ops_decl_dat( 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops.stencil S2D_VT0_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT0_1PT") ; 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT1_1PT") ; 


). 

)}; 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", 0PS_READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


ops_dat v_dat[ ] - no 0 f items per point, 

{ops_decl_dat(block, 1, 
ops_decl_dat(block, 1, 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops.stencil S2D_VT0_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT0_1PT") ; 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT1_1PT") ; 
int v_dim[2] = {4, 4}; B3iPT»T?8 


ops_dat v_dat[] = 

{ops_decl_dat(block, 1, v_dim, 
ops_decl_dat(block, 1, v_dim, 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 



step 2 - Translation to OPS kernels 
ii. Declarations 


ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT0_1PT"); 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT1_1PT"); 


int v_dim[2] = {4, 
int v_base[2] = {0, 

4},- 
- 0}; 



ops_dat v_dat[ ] = 
{ops_decl_dat(block, 

1, v_dim, 

, v_base, 

). 

ops_decl_dat(block, 

1, v_dim, 

, v_base, 

)} 


for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", 0PS_READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


} 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops.stencil S2D_VT0_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT0_1PT") ; 
ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT1_1PT") ; 
int v_dim[2] = {4, 4}; 
int v_base[2] = {0, 0}; 
int v_d_p[2] = {1, 1}; 

ops_dat v_dat[ ] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 


' s hape T -SCr 1 
^padding + halo* 


step 2 - Translation to OPS kernels 
ii. Declarations 


ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 
ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1 
ops_stencil S2D_VT1_1PT = op s_decl_ stencil(2, 1 
int v_dim[2] = {4, 4}; 
int v_base[2] = {0, 0}; 
int v_d_p[2] = {1, 1}; 
int v_d_m[2] = {-1, -1}; 


shape' 

0s| 

'padding + halo 
(padding + halo) 


{ 0 , 0 }, 
{ 0 , 0 }, 


int v_d_m[2] = {-1, -1}; 
ops_dat v_dat[ ] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d_p, 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d_p, 


"S2D_VT0_1PT"); 
"S2D_VT1 _1 PT"); 



for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", 0PS_READ), 
ops_arg_dat(v_dat[tl], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit(); 
return 0; 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 
ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 
ops_stencil S2D_VT1_1PT = op s_decl_ stencil(2, 
int v_dim[2] = {4, 4}; 
int v_base[2] = {0, 0}; 
int v_d_p[2] = {1, 1}; 
int v_d_m[2] = {-1, -1}; 
ops_dat v_dat[] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, 


'shape 
0s 

'padding + halo 
' -(padding + halo)' 


{0, 0}, "S2D_VT0_1PT"); 
{0, 0}, "S2D_VT1_1PT"); 


v_d_p, &v[0][0][0], "float" 
v_d_p, &v[1][0][0], "float" 


"vt0") , 
"vtl")} 


for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[t1], 1, S2D_VT1_1PT, "float", OPS.WRITE)); 

} 

ops_exit(); 
return 0; 


step 2 - Translation to OPS kernels 
ii. Declarations 


{ 


ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1 

ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1 

int v_dim[2] = {4, 4}; 

int v_base[2] = {0, 0}; 

int v_d_p[2] = {1, 1}; 

int v_d_m[2] = {-1, -1}; 

ops_dat v_dat[] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d 

for (int time = time_m; time <= time_M; time += 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , bl 
ops_arg_dat(v_dat [t0], 1, S2D_VT 
ops_arg_dat (v_dat [t1], 1, S2D_VT 

} 

ops_exit(); 
return 0; 

} 

























step 2 - Translation to OPS kernels 
ii. Declarations 


{ 


ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1 

ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1 

int v_dim[2] = {4, 4}; 

int v_base[2] = {0, 0}; 

int v_d_p[2] = {1, 1}; 

int v_d_m[2] = {-1, -1}; 

ops_dat v_dat[] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d 

for (int time = time_m; time <= time_M; time += 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , bl 
ops_arg_dat(v_dat [t0], 1, S2D_VT 
ops_arg_dat (v_dat [t1], 1, S2D_VT 

} 

ops_exit(); 
return 0; 

} 


























step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops.stencil S2D_VT0_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT0_1PT") ; 

ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT1_1PT") ; 

int v_dim[2] = {4, 4}; 

int v_base[2] = {0, 0}; 

int v_d_p[2] = {1, 1}; 

int v_d_m[2] = {-1, -1}; 

ops_dat v_dat[] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d_p, &v[0][0][0], "float", "vt0"), 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d_p, &v[1][0][0], "float", "vtl")}; 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl ], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit (); 
return 0; 


} 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 

ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops.stencil S2D_VT0_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT0_1PT") ; 

ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1, {0, 0}, "S2D_VT1_1PT") ; 

int v_dim[2] = {4, 4}; 

int v_base[2] = {0, 0}; 

int v_d_p[2] = {1, 1}; 

int v_d_m[2] = {-1, -1}; 

ops_dat v_dat[] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d_p, &v[0][0][0], "float", "vt0"), 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d_p, &v[1][0][0], "float", "vtl")}; 

for (int time = time_m; time <= time_M; time += 1) 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , block, 2, {x_m, x_M, y_m, y_M}, 
ops_arg_dat(v_dat[t0], 1, S2D_VT0_1PT, "float", OPS.READ), 
ops_arg_dat(v_dat[tl ], 1, S2D_VT1_1PT, "float", 0PS_WRITE)); 

} 

ops_exit (); 
return 0; 


} 
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step 2 - Translation to OPS kernels 
ii. Declarations 


{ 


ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1 

ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1 

int v_dim[2] = {4, 4}; 

int v_base[2] = {0, 0}; 

int v_d_p[2] = {1, 1}; 

int v_d_m[2] = {-1, -1}; 

ops_dat v_dat[] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d 

for (int time = time_m; time <= time_M; time += 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , bl 
ops_arg_dat(v_dat [t0], 1, S2D_VT 
ops_arg_dat (v_dat [t1], 1, S2D_VT 

} 

ops_exit(); 
return 0; 

} 


























step 2 - Translation to OPS kernels 
ii. Declarations 


{ 


ops_init(0,0,1); 

ops_block block = ops_decl_block(2, "block"); 

ops_stencil S2D_VT0_1PT = ops_decl_stencil(2, 1 

ops_stencil S2D_VT1_1PT = ops_decl_stencil(2, 1 

int v_dim[2] = {4, 4}; 

int v_base[2] = {0, 0}; 

int v_d_p[2] = {1, 1}; 

int v_d_m[2] = {-1, -1}; 

ops_dat v_dat[] = 

{ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d 
ops_decl_dat(block, 1, v_dim, v_base, v_d_m, v_d 

for (int time = time_m; time <= time_M; time += 

{ 

ops_par_loop(OPS_Kernel_0, "OPS_Kernel_0" , bl 
ops_arg_dat(v_dat [t0], 1, S2D_VT 
ops_arg_dat (v_dat [t1], 1, S2D_VT 

} 

ops_exit(); 
return 0; 

} 
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3D Acoustic Wave Propagation Prototype 

Time step: 0.001s 

Total time: 30s 

Domain size: 1km x 1km x 1km 

Ricker source peak frequency: 10Hz 

Single layer velocity model of 2km/s 

Absorbing boundary 

Grid points: 32 3 , 64 3 , 128 3 ,256 3 , 512 3 
4th Space order 



3D Acoustic Wave Propagation Prototype 


Time step: 0.001s 

Total time: 30s 

Domain size: 1km x 1km x 1km 

Ricker source peak frequency: 10Hz 

Single layer velocity model of 2km/s 

Absorbing boundary 

Grid points: 32 3 , 64 3 , 128 3 ,256 3 , 512 3 
4th Space order 
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Full code generation + JIT compilation chain for a 2D FWI 
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Current efforts 


Full code generation + JIT compilation chain for a 2D FWI 
prototype 


Overcoming abusing CPU-GPU data movement 
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https://www.devitoproject.org/ 


https://github.com/opesci/devito 


https://opesci-slackin.now.sh/ 


https ://twitter.com/opesci project 


144 


